Download Excel file from: https://data.gov.uk/dataset/road-accidents-safety-data/resource/80b76aec-a0a1-4e14-8235-09cc6b92574a

Excel (or OpenOffice) can't open it, since it's too large. Can Python help?

Note: If code freezes, kill Ipython and try again.


In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("Accidents7904.csv")


C:\st\Anaconda3\lib\site-packages\pandas\io\parsers.py:1170: DtypeWarning: Columns (13,31) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)

In [3]:
len(data)


Out[3]:
6224198

In [5]:
data.columns


Out[5]:
Index(['Accident_Index', 'Location_Easting_OSGR', 'Location_Northing_OSGR',
       'Longitude', 'Latitude', 'Police_Force', 'Accident_Severity',
       'Number_of_Vehicles', 'Number_of_Casualties', 'Date', 'Day_of_Week',
       'Time', 'Local_Authority_(District)', 'Local_Authority_(Highway)',
       '1st_Road_Class', '1st_Road_Number', 'Road_Type', 'Speed_limit',
       'Junction_Detail', 'Junction_Control', '2nd_Road_Class',
       '2nd_Road_Number', 'Pedestrian_Crossing-Human_Control',
       'Pedestrian_Crossing-Physical_Facilities', 'Light_Conditions',
       'Weather_Conditions', 'Road_Surface_Conditions',
       'Special_Conditions_at_Site', 'Carriageway_Hazards',
       'Urban_or_Rural_Area', 'Did_Police_Officer_Attend_Scene_of_Accident',
       'LSOA_of_Accident_Location'],
      dtype='object')

In [7]:
data_sunday = data[data.Day_of_Week == 1]
len(data_sunday)


Out[7]:
693847

In [12]:
d = (data[data["Day_of_Week"]])
len(d)


---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-12-8f873d22984c> in <module>()
----> 1 d = (data[data["Day_of_Week"]])
      2 len(d)

C:\st\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
   1789         if isinstance(key, (Series, np.ndarray, Index, list)):
   1790             # either boolean or fancy integer index
-> 1791             return self._getitem_array(key)
   1792         elif isinstance(key, DataFrame):
   1793             return self._getitem_frame(key)

C:\st\Anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_array(self, key)
   1834         else:
   1835             indexer = self.ix._convert_to_indexer(key, axis=1)
-> 1836             return self.take(indexer, axis=1, convert=True)
   1837 
   1838     def _getitem_multilevel(self, key):

C:\st\Anaconda3\lib\site-packages\pandas\core\generic.py in take(self, indices, axis, convert, is_copy)
   1356         new_data = self._data.take(indices,
   1357                                    axis=self._get_block_manager_axis(axis),
-> 1358                                    convert=True, verify=True)
   1359         result = self._constructor(new_data).__finalize__(self)
   1360 

C:\st\Anaconda3\lib\site-packages\pandas\core\internals.py in take(self, indexer, axis, verify, convert)
   3273         new_labels = self.axes[axis].take(indexer)
   3274         return self.reindex_indexer(new_axis=new_labels, indexer=indexer,
-> 3275                                     axis=axis, allow_dups=True)
   3276 
   3277     def merge(self, other, lsuffix='', rsuffix=''):

C:\st\Anaconda3\lib\site-packages\pandas\core\internals.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
   3155         if axis == 0:
   3156             new_blocks = self._slice_take_blocks_ax0(
-> 3157                 indexer, fill_tuple=(fill_value,))
   3158         else:
   3159             new_blocks = [blk.take_nd(indexer, axis=axis,

C:\st\Anaconda3\lib\site-packages\pandas\core\internals.py in _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple)
   3236                     blocks.append(blk.take_nd(
   3237                         blklocs[mgr_locs.indexer], axis=0,
-> 3238                         new_mgr_locs=mgr_locs, fill_tuple=None))
   3239 
   3240         return blocks

C:\st\Anaconda3\lib\site-packages\pandas\core\internals.py in take_nd(self, indexer, axis, new_mgr_locs, fill_tuple)
    851             fill_value = self.fill_value
    852             new_values = com.take_nd(self.get_values(), indexer, axis=axis,
--> 853                                      allow_fill=False)
    854         else:
    855             fill_value = fill_tuple[0]

C:\st\Anaconda3\lib\site-packages\pandas\core\common.py in take_nd(arr, indexer, axis, out, fill_value, mask_info, allow_fill)
    836             out = np.empty(out_shape, dtype=dtype, order='F')
    837         else:
--> 838             out = np.empty(out_shape, dtype=dtype)
    839 
    840     func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype,

MemoryError: 

In [ ]: